/**
* Author: WuLC
* Date:   2016-05-25 09:18:09
* Last modified by:   WuLC
* Last Modified time: 2016-05-25 14:50:52
* Email: liangchaowu5@gmail.com
******************************************************
* Function: combine TextRank and TF-IDF to extract keywords 
* Input: path of the directory of the corpus
* Output: keywords extracted for each document
*/

package com.lc.nlp.keyword.algorithm;

import java.util.*;
import com.lc.nlp.parsedoc.ReadDir;
import com.lc.nlp.parsedoc.ReadFile;


/**
 * Keyword extraction that combines TextRank scores with corpus-level TF-IDF information.
 * Two strategies are offered: weighting each TextRank score by the word's IDF
 * ({@link #textRankMultiplyIDF(String)}), and voting between the keyword lists produced
 * by the two algorithms ({@link #textRankTFIDFVote(String)}).
 *
 * The configuration is held in static fields, so it is shared by every caller of this class.
 */
public class TextRankWithTFIDF 
{
	// number of keywords returned for each document
	private static int keywordsNumber = 5;
	// number of TF-IDF candidates requested by the voting strategy (twice keywordsNumber)
	private static int keywordCandidateNum = 10;
	
	/**
	 * set the number of keywords to extract; the number of TF-IDF candidates used by
	 * the voting strategy is set to twice this value
	 * @param number(int): number of keywords to extract 
	 */
	public static void setKeywordsNumber(int number)
	{
		keywordsNumber = number;
		keywordCandidateNum = 2 * number;
	}
	
	/**
	 * multiply the TextRank-score of a word by the IDF value of this word in a corpus,
	 * then keep the highest-scoring words of each document.
	 * A word that has no IDF value gets a combined score of 0 instead of raising an exception.
	 * A document with fewer scored words than the configured number of keywords
	 * simply yields a shorter list.
	 * @param dirPath(String): path of the directory of the corpus
	 * @return(Map<String,List<String>>): keywords of each document of the corpus, keyed by
	 *         the file names returned by ReadDir.readDirFileNames
	 */
	public static Map<String,List<String>> textRankMultiplyIDF(String dirPath)
	{
		Map<String,List<String>> result = new HashMap<String,List<String>>();
		
		// get the IDF values for the words of a corpus
		Map<String,Float> idfForDir = TFIDF.idfForDir(dirPath);
		List<String> fileList = ReadDir.readDirFileNames(dirPath);
		
		for (String file : fileList)
		{
			String content = ReadFile.loadFile(file);
			Map<String,Float> trScores = TextRank.getWordScore("", content);
			
			// build the combined scores in a separate list so that the map returned by
			// TextRank is not modified while it is being iterated
			List<Map.Entry<String,Float>> scored =
					new ArrayList<Map.Entry<String,Float>>(trScores.size());
			for (Map.Entry<String,Float> entry : trScores.entrySet())
			{
				String word = entry.getKey();
				Float idf = idfForDir.get(word);
				// no IDF value for this word: weight it with 0 so that it is ranked last
				float weight = (idf == null) ? 0f : idf.floatValue();
				scored.add(new AbstractMap.SimpleEntry<String,Float>(word, entry.getValue() * weight));
			}
			
			// sort the words in terms of their score in descending order
			Collections.sort(scored,
					new Comparator<Map.Entry<String,Float>>()
				{
					public int compare(Map.Entry<String,Float> c1, Map.Entry<String,Float> c2)
					{
						return c2.getValue().compareTo(c1.getValue());
					}
				}
			);
			
			// never read past the end of the list when the document has only a few words
			int limit = Math.min(keywordsNumber, scored.size());
			List<String> keywords = new ArrayList<String>();
			for (int i = 0; i < limit; i++)
			{
				keywords.add(scored.get(i).getKey());
			}
			result.put(file, keywords);
		}
		return result;
	}
   
	/**
	 * integrate the results generated by TextRank and TF-IDF: choose those words that co-occur in both 
	 * results, and if the number of co-occurring words is not enough, choose the remaining ones from the
	 * results of TF-IDF (in TF-IDF order).
	 * Every document gets an entry in the result; the list holds at most the configured number of
	 * keywords and is shorter when TF-IDF does not provide enough candidates.
	 * Note that this method changes the keyword number of TFIDF through TFIDF.setKeywordsNumber,
	 * while the number of keywords returned by TextRank is not configured here.
	 * @param dirPath(String): path of the directory of the corpus
	 * @return(Map<String,List<String>>): keywords of each document of the corpus
	 */
	public static Map<String,List<String>> textRankTFIDFVote(String dirPath)
	{
		Map<String,List<String>> result = new HashMap<String,List<String>>();
		List<String> fileList = ReadDir.readDirFileNames(dirPath);
		
		// get keywords generated by TF-IDF
		TFIDF.setKeywordsNumber(keywordCandidateNum);
		Map<String,List<String>> tfidfKeywordsForDir = TFIDF.getKeywords(dirPath);
		
		for (String file : fileList)
		{
			String content = ReadFile.loadFile(file);
			List<String> trKeyword = TextRank.getKeyword("", content);
			List<String> tfidfKeyword = tfidfKeywordsForDir.get(file);
			
			// a document without TF-IDF keywords is treated as having no candidates
			if (tfidfKeyword == null)
				tfidfKeyword = Collections.<String>emptyList();
			
			// hash set gives constant-time membership tests for the TextRank keywords
			Set<String> trKeywordSet = new HashSet<String>();
			if (trKeyword != null)
				trKeywordSet.addAll(trKeyword);
			
			// the linked set keeps the selection order and never stores a word twice
			Set<String> selected = new LinkedHashSet<String>();
			
			// first pass: words that both algorithms agree on
			for (String keyword : tfidfKeyword)
			{
				if (selected.size() >= keywordsNumber)
					break;
				if (trKeywordSet.contains(keyword))
					selected.add(keyword);
			}
			
			// second pass: fill the remaining places with the other TF-IDF keywords
			for (String keyword : tfidfKeyword)
			{
				if (selected.size() >= keywordsNumber)
					break;
				selected.add(keyword);
			}
			
			result.put(file, new ArrayList<String>(selected));
		}
		return result;
	}

}
